# blast-parser -----------------------------------------------------------
#' @include blastReport-class.r
#' @importFrom Biostrings BString
#' @importFrom XML xmlInternalTreeParse xmlRoot xpathApply xmlDoc xmlValue
NULL
#' Parse NCBI BLAST XML files.
#'
#' @param x BLAST output in XML format (as file path or character string).
#' @param asText If \code{TRUE} the XML BLAST output is passed in as a string
#' instead of a file path.
#' @return A \code{\linkS4class{BlastReport}} object.
#' @export
#' @examples
#' hla.xml <- system.file("extdata", "HLA-B-UTR.blast.xml", package = "blastr")
#' x <- blastReport(hla.xml)
#'
#' ## look at the Header
#' getHeader(x)
#'
#' ## look at the BLAST parameters
#' getParams(x)
#'
#' ## look at the results of the first query
#' getQuery(x, 1)
#'
#' ## or easier
#' x[[1]]
#'
#' ## access the first two hits in the second query
#' getHit(getQuery(x, 2), 1:2)
#'
#' ## for better readability use piping
#' \dontrun{
#' library("magrittr")
#' x %>% getQuery(2) %>% getHit(1:2)
#' }
#'
#' ## or simple subsetting
#' x[[2]][1:2]
#'
#' ## it's also possible to access the two best hits for all queries at once
#' getHit(x, 1:2)
#'
#' ## access the hit ranges
#' getHitRange(x[[2]][1:2])
#'
#' ## access evalues
#' getEvalue(x[[2]][1:2])
#'
blastReport <- function(x, asText = FALSE) {
doc <- xmlRoot(xmlInternalTreeParse(x, asText = asText))
dbr <- parseBlastDatabaseReport(doc)
query_elems <- xpathApply(doc, '/BlastOutput/BlastOutput_iterations/Iteration')
new_BlastReport(
header = dbr$header,
parameters = dbr$params,
queries = parseQueries(query_elems)
)
}
parseBlastDatabaseReport <- function(doc) {
list(
header = new_BlastHeader(
version = xvalue(doc, '//BlastOutput/BlastOutput_version'),
reference = xvalue(doc, '//BlastOutput/BlastOutput_reference'),
database = xvalue(doc, '/BlastOutput/BlastOutput_db')
),
params = new_BlastParameters(
program = xvalue(doc, '/BlastOutput/BlastOutput_program'),
matrix = xvalue(doc, '/BlastOutput/BlastOutput_param/Parameters/Parameters_matrix'),
expect = xvalue(doc, '/BlastOutput/BlastOutput_param/Parameters/Parameters_expect', as = 'numeric'),
penalties = {
open <- xvalue(doc, '/BlastOutput/BlastOutput_param/Parameters/Parameters_gap-open', as = 'integer')
extend <- xvalue(doc, '/BlastOutput/BlastOutput_param/Parameters/Parameters_gap-extend', as = 'integer')
c(open = open, extend = extend)
},
filter = xvalue(doc, '/BlastOutput/BlastOutput_param/Parameters/Parameters_filter'),
sc_match = xvalue(doc, '/BlastOutput/BlastOutput_param/Parameters/Parameters_sc-match', as = 'integer'),
sc_mismatch = xvalue(doc, '/BlastOutput/BlastOutput_param/Parameters/Parameters_sc-mismatch', as = 'integer'),
num_sequences = xvalue(doc, '//Iteration[position()=1]/Iteration_stat/Statistics/Statistics_db-num'),
num_letters = xvalue(doc, '//Iteration[position()=1]/Iteration_stat/Statistics/Statistics_db-len'),
hsp_length = xvalue(doc, '//Iteration[position()=1]/Iteration_stat/Statistics/Statistics_hsp-len', as = 'numeric'),
effective_space = xvalue(doc, '//Iteration[position()=1]/Iteration_stat/Statistics/Statistics_eff-space', as = 'numeric'),
ka_params = {
k <- xvalue(doc, '//Iteration[position()=1]/Iteration_stat/Statistics/Statistics_kappa', as = 'numeric')
lambda <- xvalue(doc, '//Iteration[position()=1]/Iteration_stat/Statistics/Statistics_lambda', as = 'numeric')
h <- xvalue(doc, '//Iteration[position()=1]/Iteration_stat/Statistics/Statistics_entropy', as = 'numeric')
c(k = k, lambda = lambda, h = h)
}
)
)
}
parseQueries <- function(query_elems) {
QueryList(
lapply(query_elems, function(elem) {
doc <- xmlDoc(elem)
hit_elems <- xpathApply(doc, '/Iteration/Iteration_hits/Hit')
query_env <- new.env(parent=emptyenv())
query_env[['iter_num']] <- xvalue(doc, '/Iteration/Iteration_iter-num', as='integer')
query_env[['query_id']] <- xvalue(doc, '/Iteration/Iteration_query-ID')
query_env[['query_def']] <- xvalue(doc, '/Iteration/Iteration_query-def')
query_env[['query_len']] <- xvalue(doc, '/Iteration/Iteration_query-len', as='integer')
message <- xvalue(doc, '/Iteration/Iteration_message')
new_Query(
iter_num = query_env[['iter_num']] , query_id = query_env[['query_id']],
query_def = query_env[['query_def']], query_len = query_env[['query_len']],
hits = message %|na|% parseHits(hit_elems, query_env),
query_env = query_env
)
})
)
}
parseHits <- function(hit_elems, query_env) {
HitList(
lapply(hit_elems, function(elem) {
doc <- xmlDoc(elem)
hsp_elems <- xpathApply(doc, '/Hit/Hit_hsps/Hsp')
hit_len <- xvalue(doc, '/Hit/Hit_len', as='integer')
query_env[['hit_len']] <- hit_len
new_Hit(
hit_num = xvalue(doc, '/Hit/Hit_num', as='integer'),
hit_def = Deflines(x=paste(xvalue(doc, '/Hit/Hit_id'),
xvalue(doc, '/Hit/Hit_def'))
),
hit_acc = xvalue(doc, '/Hit/Hit_accession'),
hit_len = hit_len,
hsps = parseHsps(hsp_elems, query_env),
query_env = query_env
)
}),
query_env = query_env
)
}
parseHsps <- function(hsp_elems, query_env) {
HspList(
lapply(hsp_elems, function(elem) {
doc <- xmlDoc(elem)
new_Hsp(
hsp_num = xvalue(doc, '/Hsp/Hsp_num', as='integer'),
bit_score = xvalue(doc, '/Hsp/Hsp_bit-score', as='numeric'),
score = xvalue(doc, "/Hsp/Hsp_score", as='numeric'),
evalue = xvalue(doc, "/Hsp/Hsp_evalue", as='numeric'),
query_from = xvalue(doc, "/Hsp/Hsp_query-from", as='integer'),
query_to = xvalue(doc, "/Hsp/Hsp_query-to", as='integer'),
hit_from = xvalue(doc, "/Hsp/Hsp_hit-from", as='integer'),
hit_to = xvalue(doc, "/Hsp/Hsp_hit-to", as='integer'),
query_frame = xvalue(doc, "/Hsp/Hsp_query-frame", as='integer'),
hit_frame = xvalue(doc, "/Hsp/Hsp_hit-frame", as='integer'),
identity = xvalue(doc, "/Hsp/Hsp_identity", as='integer'),
positive = xvalue(doc, "/Hsp/Hsp_positive", as='integer'),
gaps = xvalue(doc, "/Hsp/Hsp_gaps", as='integer'),
align_len = xvalue(doc, "/Hsp/Hsp_align-len", as='integer'),
qseq = BString(xvalue(doc, "/Hsp/Hsp_qseq")),
hseq = BString(xvalue(doc, "/Hsp/Hsp_hseq")),
match = BString(xvalue(doc, "//Hsp_midline")),
query_env = query_env
)
}),
query_env = query_env
)
}
#' Parse NCBI BLAST table files into \linkS4class{BlastTable} objects.
#'
#' @param x BLAST output in tabular format (file path or character string)
#' @return A \code{\linkS4class{blastTable}} object.
#' @rdname blastTable
#' @export
#' @examples
#' ##
blastTable <- function(x) {
## split multiple query results
if (is.character(x) && !is.na(x[2]) && grepl("^# Query:.*$", x[2])) {
start <- grep("^# Query:.*$", x) - 1
end <- unique(c(grep("^# Query:.*$", x)[-1] - 2, length(x)))
x <- Map(function(i) x[i], i = Map(seq, from = start, to = end))
}
res <- list()
for (blout in x) {
if (is.character(blout) && file.exists(blout)) {
cf <- count.fields(blout, sep = "\t", comment.char = "#")
file_path <- file(blout, open = "r")
} else {
cf <- count.fields(textConnection(as.character(blout)),
sep = "\t", comment.char = "#")
file_path <- textConnection(as.character(blout))
}
if (!all(cf == 12)) {
stop(sprintf("Malformed blast table. %s columns in row %s.\n",
sQuote(cf[cf > 12]), sQuote(which(cf > 12))))
}
hasComment <- TRUE
comStr <- NULL
while (hasComment) {
line <- readLines(file_path, n = 1)
if (hasComment <- str_detect(line, "^#")) {
comStr <- c(comStr, str_match(line, "[^(# *)].*"))
}
}
pushBack(line, connection = file_path)
hit_table <-
read.table(file_path, header = FALSE, sep = "\t",
as.is = TRUE, nrows = length(cf),
col.names = c("qid", "sid", "pident",
"length", "mismatch", "gapopen",
"qstart", "qend", "sstart",
"send","evalue","bit_score"),
colClasses = c("character", "character", "numeric",
"integer", "integer", "integer",
"integer", "integer", "integer",
"integer", "numeric", "numeric"))
## parse subjectIds in 'hit_table'
all_ids <- with(hit_table, strsplit(sid, "\\|"))
gi <- vapply(all_ids, '[', 2, FUN.VALUE = '')
source_tag <- vapply(all_ids, '[', 3, FUN.VALUE = '')
accn <- ifelse(grepl("pdb", source_tag),
paste0(sapply(all_ids, '[', 4), "_", sapply(all_ids, '[', 5)),
sapply(all_ids, '[', 4))
if (all(is.na(accn))) {
accn <- hit_table[["sid"]]
}
neg_log_evalue <- with(hit_table, -log(as.numeric(evalue)))
neg_log_evalue[is.infinite(neg_log_evalue)] <- -log(1e-323)
if (!is.null(comStr) && length(comStr) == 5) {
program <- comStr[1]
query <- str_split_fixed(comStr[2], "Query: ", 2)[, 2]
database <- str_split_fixed(comStr[3], "Database: ", 2)[, 2]
} else {
program <- query <- database <- NA_character_
}
close(file_path)
res <- c(res, new('BlastTable', program = program, database = database,
query = query,
bit_score = as.numeric(hit_table[["bit_score"]]),
evalue = as.numeric(hit_table[["evalue"]]),
mlog.evalue = neg_log_evalue,
geneid = gi,
accession = accn,
table = hit_table))
}
if (length(res) == 1) {
res[[1]]
} else res
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.