#' @include partition.R partition_bundle.R S4classes.R
NULL
#' Generate html from object.
#'
#' Prepare html document to see full text.
#'
#' @return Returns an object of class `html` as used in the 'htmltools' package.
#' Methods such as `htmltools::html_print()` will be available. The encoding
#' of the html document will be UTF-8 on all systems (including Windows).
#'
#' @details Substitutions configured by option 'polmineR.mdsub' are applied to
#' prevent presence of characters that would be misinterpreted as markdown
#' formatting instructions.
#' @details If param `charoffset} is `TRUE``, character offset positions will be
#' added to tags that embrace tokens. This may be useful, if exported html
#' document is annotated with a tool that stores annotations with character
#' offset positions.
#'
#' @param object The object the fulltext output will be based on.
#' @param meta Metadata to include in output, if `NULL` (default), the
#' s-attributes defining a partition will be used.
#' @param s_attribute Structural attributes that will be used to define the
#' partition where the match occurred.
#' @param cpos Length-one `logical` value, if `TRUE` (default), all
#' tokens will be wrapped by elements with id attribute indicating corpus
#' positions.
#' @param corpus The ID of the corpus, a length-one `character` vector.
#' @param beautify A `logical` value, if `TRUE`, whitespace before
#' interpunctuation will be removed.
#' @param charoffset Length-one `logical` value, if `TRUE`, character offset
#' positions are added to elements embracing tokens.
#' @param height A `character` vector that will be inserted into the html as an
#' optional height of a scroll box.
#' @param verbose Length-one `logical` value, whether to output progress
#' messages.
#' @param progress Length-one `logical` value, whether to output progress bar.
#' @param cutoff An `integer` value, maximum number of tokens to decode
#' from token stream, passed into `as.markdown()`.
#' @param type The partition type.
#' @param i An `integer` value: If `object` is a `kwic`-object,
#' the index of the concordance for which the fulltext is to be generated.
#' @param ... Further parameters that are passed into `as.markdown()`.
#' @rdname html-method
#' @aliases show,html-method
#' @examples
#' use(pkg = "RcppCWB", corpus = "REUTERS")
#'
#' P <- partition("REUTERS", places = "argentina")
#' H <- html(P)
#' if (interactive()) H # show full text in viewer pane
#'
#' # html-method can be used in a pipe
#' H <- partition("REUTERS", places = "argentina") %>% html()
#'
#' # use html-method to get full text where concordance occurrs
#' K <- kwic("REUTERS", query = "barrels")
#' H <- html(K, i = 1, s_attribute = "id")
#' H <- html(K, i = 2, s_attribute = "id")
#' for (i in 1L:length(K)) {
#' H <- html(K, i = i, s_attribute = "id")
#' if (interactive()){
#' show(H)
#' userinput <- readline("press 'q' to quit or any other key to continue")
#' if (userinput == "q") break
#' }
#' }
#'
setGeneric("html", function(object, ...) standardGeneric("html") )
#' @exportMethod html
#' @rdname html-method
setMethod("html", "character", function(object, corpus, height = NULL){
if (!requireNamespace("markdown", quietly = TRUE))
stop("package 'markdown' is not installed, but necessary for this function")
css <- paste(
c(
readLines(system.file("css", "markdown.css", package = "polmineR")),
readLines(system.file("css", "tooltips.css", package = "polmineR"))
),
collapse = "\n", sep = "\n"
)
for (i in seq_along(getOption("polmineR.mdsub"))){
object <- gsub(
getOption("polmineR.mdsub")[[i]][1],
getOption("polmineR.mdsub")[[i]][2],
object
)
}
# produce result very similar to markdown::markdownToHTML, but selfmade to
# circumvent encoding issue on windows (poor handling of encodings other
# than UTF-8 by markdownToHTML)
template <- "<!DOCTYPE html>\n<html>\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"/>\n<title>%s</title>\n<style type=\"text/css\">\n%s\n</style>\n</head>\n<body>\n%s\n</body>\n</html>"
doc <- sprintf(
template,
sprintf("Corpus: %s", corpus), # title
css,
markdown::renderMarkdown(text = object)
)
if (!is.null(height)){
fmt <- '%s<div style="border: 1px solid #ddd; padding: 5px; overflow-y: scroll; height: %s;">%s</div></body></html>'
doc <- sprintf(
fmt,
gsub("^(.*?)<body>\n.*?</body>.*?$", "\\1", doc),
height,
gsub("^.*?<body>\n(.*?)</body>.*?$", "\\1", doc)
)
doc <- gsub("<h3>", '<h3 class="fulltext">', doc)
}
doc
})
#' @importFrom xml2 read_html xml_find_all xml_text xml_parent xml_attr
#' @importFrom xml2 xml_attrs xml_remove xml_attr<- xml_name
#' @importFrom stringi stri_extract_all_boundaries
.addCharacterOffset <- function(x){
# check that required dependencies are present
if (!requireNamespace("xml2", quietly = TRUE))
stop("package 'xml2' not installed (please install)")
if (!requireNamespace("htmltools", quietly = TRUE))
stop("package 'htmltools' not installed (please install")
doc <- read_html(x)
textnodes <- xml_find_all(doc, xpath = "//body//text()")
nodetext <- sapply(textnodes, xml_text)
nchar_textnodes <- sapply(nodetext, nchar)
no_linesplit <- sapply(stri_extract_all_boundaries(nodetext), function(x) length(grep("\\n", x)))
nchar_textnodes <- nchar_textnodes - ifelse(no_linesplit > 0, 1, 0)
offset <- c(0L, cumsum(nchar_textnodes)[1L:(length(nchar_textnodes) - 1L)])
dummy <- lapply(
1L:length(textnodes),
function(i){
if (!grepl("^\\s*$", nodetext[i])){
parent <- xml2::xml_parent(textnodes[[i]])
if (xml_name(parent) == "span"){
xml_attr(parent, "left") <- as.character(offset[i])
xml_attr(parent, "right") <- offset[i] + nchar(nodetext[i])
}
}
return(NULL)
}
)
return( as.character(doc) )
}
#' @importFrom xml2 xml_find_first xml_text<-
.beautify <- function(x){
doc <- xml2::read_html(x)
lapply(
c(",", ".", ":", ";", "!", "?"),
function(interpunctuation){
nodes <- xml2::xml_find_all(
doc,
xpath = sprintf("//span[starts-with(@token, '%s')]", interpunctuation)
)
if (length(nodes) >= 1){
lapply(
1L:length(nodes),
function(j){
precedingTextNode <- xml_find_first(
nodes[[j]],
xpath = "./preceding-sibling::text()[1]"
)
if (!is.na(xml_text(precedingTextNode))){
if (xml_text(precedingTextNode) == " ") xml_text(precedingTextNode) <- ""
}
}
)
}
}
)
# remove continuing double quotes
nodes <- xml_find_all(doc, xpath = sprintf('//span[@token = "%s"]', "''"))
if (length(nodes) >= 2){
cpos <- as.integer(sapply(lapply(nodes, xml_attrs), function(x) x["id"]))
for (i in 2L:length(cpos)){
if (cpos[i - 1L] + 1L == cpos[i]){
node_to_remove <- xml_find_first(doc, xpath = sprintf('//span[@id = "%d"]', cpos[i]))
if (length(node_to_remove) > 0) xml_remove(node_to_remove)
}
}
}
as.character(doc)
}
#' @rdname html-method
setMethod(
"html", "partition",
function(object, meta = NULL, cpos = TRUE, verbose = FALSE, cutoff = NULL, charoffset = FALSE, beautify = TRUE, height = NULL, ...){
newobj <- if (is.null(get_type(object))){
"subcorpus"
} else {
switch( get_type(object), "plpr" = "plpr_subcorpus", "press" = "press_subcorpus" )
}
html(
object = as(object, newobj),
meta = meta,
cpos = cpos,
verbose = verbose,
cutoff = cutoff,
charoffset = charoffset,
beautify = beautify,
height = height,
...
)
})
#' @rdname html-method
#' @exportMethod html
#' @docType methods
setMethod(
"html", "subcorpus",
function(object, meta = NULL, cpos = TRUE, verbose = FALSE, cutoff = NULL, charoffset = FALSE, beautify = FALSE, height = NULL, ...){
if (!requireNamespace("markdown", quietly = TRUE) && requireNamespace("htmltools", quietly = TRUE)){
stop("package 'markdown' is not installed, but necessary for this function")
}
# code to get meta (almost) identical with read,subcorpus()
if (is.null(meta)){
template_meta <- get_template(object)[["metadata"]]
meta <- if (is.null(template_meta))
names(object@s_attributes)
else
template_meta
}
if (isFALSE(all(meta %in% s_attributes(object))))
warning("not all s-attributes provided as meta are available")
.message("generating markdown", verbose = verbose)
md <- as.markdown(object, meta = meta, cpos = cpos, cutoff = cutoff, ...)
.message("markdown to html", verbose = verbose)
# the css for tooltips is included in the html document by default,
# even if this causes some (minimal) overhead.
# The default stylesheet (markdown.css) needs to be included explicitly,
# so that it is not lost.
doc <- html(object = md, corpus = object@corpus, height = height)
if (beautify) doc <- .beautify(doc)
if (charoffset) doc <- .addCharacterOffset(doc)
ret <- htmltools::HTML(doc)
attr(ret, "browsable_html") <- TRUE
ret
}
)
#' @docType methods
#' @rdname html-method
setMethod("html", "partition_bundle", function(object, charoffset = FALSE, beautify = TRUE, height = NULL, progress = TRUE, ...){
corpus_id <- get_corpus(object)
if (!requireNamespace("markdown", quietly = TRUE))
stop("package 'markdown' is not installed, but necessary for this function")
md_list <- if (isTRUE(progress)){
pblapply(object@objects, function(p) as.markdown(p, ...))
} else {
lapply(object@objects, function(p) as.markdown(p, ...))
}
md <- paste(md_list, collapse = "\n* * *\n")
md <- paste(
paste('## Excerpt from corpus', corpus_id, '\n* * *\n'),
md,
'\n* * *\n',
collapse = "\n"
)
doc <- html(object = md, corpus = corpus_id, height = height)
if (beautify) doc <- .beautify(doc)
if (charoffset) doc <- .addCharacterOffset(doc)
ret <- htmltools::HTML(doc)
attr(ret, "browsable_html") <- TRUE
ret
})
#' @rdname html-method
setMethod("html", "kwic", function(object, i, s_attribute = NULL, type = NULL, verbose = FALSE){
# getting metadata for all kwic lines is potentially not the fastes solution ...
if (!is.null(s_attribute)){
if (!s_attribute %in% s_attributes(object@corpus))
stop("s-attribute provided is not available")
s_attrs <- s_attribute
object <- enrich(object, s_attributes = s_attrs)
} else if (length(object@metadata) == 0L){
s_attrs <- get_template(object)[["metadata"]]
if (is.null(s_attrs)){
stop(
"The html()-method for kwic objects requires ",
"an explicit statement of an s-attribute in the method call, ",
"an implicit declaration via the kwic metadata ",
"or a statement in a template. Requirement not met."
)
} else {
.message("using metadata from template: ", paste(s_attrs, collapse = " / "), verbose = verbose)
if (length(s_attrs) > 0L){
.message("enriching", verbose = verbose)
object <- enrich(object, meta = s_attrs)
}
}
} else {
s_attrs <- object@metadata
}
partition_to_read <- partition(
object@corpus,
def = lapply(setNames(s_attrs, s_attrs), function(x) object@stat[[x]][i]),
type = type
)
.message("generating html", verbose = verbose)
fulltext <- polmineR::html(partition_to_read, meta = s_attrs, cpos = TRUE)
.message("generating highlights", verbose = verbose)
tabSubset <- object@cpos[which(object@cpos[["match_id"]] == i)]
cposContext <- tabSubset[which(tabSubset[["position"]] != 0)][["cpos"]]
cposNode <- tabSubset[which(tabSubset[["position"]] == 0)][["cpos"]]
fulltext <- highlight(
fulltext,
highlight = list(yellow = cposContext, lightgreen = cposNode)
)
fulltext
})
#' @rdname html-method
setMethod("html", "remote_subcorpus", function(object, ...){
ocpu_exec(fn = "html", corpus = object@corpus, server = object@server, restricted = object@restricted, object = as(object, "subcorpus"), ...)
})
#' @include polmineR.R
NULL
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.