Nothing
#' @include partition.R partition_bundle.R S4classes.R
NULL
#' Generate html from object.
#'
#' Prepare html document to see full text.
#'
#' @return Returns an object of class `html` as used in the 'htmltools' package.
#' Methods such as `htmltools::html_print()` will be available. The encoding
#' of the html document will be UTF-8 on all systems (including Windows).
#'
#' @details Substitutions configured by option 'polmineR.mdsub' are applied to
#' prevent presence of characters that would be misinterpreted as markdown
#' formatting instructions.
#' @details If param `charoffset} is `TRUE``, character offset positions will be
#' added to tags that embrace tokens. This may be useful, if exported html
#' document is annotated with a tool that stores annotations with character
#' offset positions.
#'
#' @param object The object the fulltext output will be based on.
#' @param meta Metadata to include in output, if `NULL` (default), the
#' s-attributes defining a partition will be used.
#' @param s_attribute Structural attributes that will be used to define the
#' partition where the match occurred.
#' @param cpos Length-one `logical` value, if `TRUE` (default), all
#' tokens will be wrapped by elements with id attribute indicating corpus
#' positions.
#' @param corpus The ID of the corpus, a length-one `character` vector.
#' @param beautify A `logical` value, if `TRUE`, whitespace before
#' interpunctuation will be removed.
#' @param charoffset Length-one `logical` value, if `TRUE`, character offset
#' positions are added to elements embracing tokens.
#' @param height A `character` vector that will be inserted into the html as an
#' optional height of a scroll box.
#' @param verbose Length-one `logical` value, whether to output progress
#' messages.
#' @param progress Length-one `logical` value, whether to output progress bar.
#' @param cutoff An `integer` value, maximum number of tokens to decode
#' from token stream, passed into `as.markdown()`.
#' @param type The partition type.
#' @param i An `integer` value: If `object` is a `kwic`-object,
#' the index of the concordance for which the fulltext is to be generated.
#' @param ... Further parameters that are passed into `as.markdown()`.
#' @rdname html-method
#' @aliases show,html-method
#' @examples
#' use(pkg = "RcppCWB", corpus = "REUTERS")
#'
#' P <- partition("REUTERS", places = "argentina")
#' H <- html(P)
#' if (interactive()) H # show full text in viewer pane
#'
#' # html-method can be used in a pipe
#' H <- partition("REUTERS", places = "argentina") %>% html()
#'
#' # use html-method to get full text where concordance occurrs
#' K <- kwic("REUTERS", query = "barrels")
#' H <- html(K, i = 1, s_attribute = "id")
#' H <- html(K, i = 2, s_attribute = "id")
#' for (i in 1L:length(K)) {
#' H <- html(K, i = i, s_attribute = "id")
#' if (interactive()){
#' show(H)
#' userinput <- readline("press 'q' to quit or any other key to continue")
#' if (userinput == "q") break
#' }
#' }
#'
setGeneric("html", function(object, ...) standardGeneric("html") )
#' @exportMethod html
#' @rdname html-method
setMethod("html", "character", function(object, corpus, height = NULL){
if (!requireNamespace("markdown", quietly = TRUE))
stop("package 'markdown' is not installed, but necessary for this function")
css <- paste(
c(
readLines(system.file("css", "markdown.css", package = "polmineR")),
readLines(system.file("css", "tooltips.css", package = "polmineR"))
),
collapse = "\n", sep = "\n"
)
for (i in seq_along(getOption("polmineR.mdsub"))){
object <- gsub(
getOption("polmineR.mdsub")[[i]][1],
getOption("polmineR.mdsub")[[i]][2],
object
)
}
# produce result very similar to markdown::markdownToHTML, but selfmade to
# circumvent encoding issue on windows (poor handling of encodings other
# than UTF-8 by markdownToHTML)
template <- "<!DOCTYPE html>\n<html>\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"/>\n<title>%s</title>\n<style type=\"text/css\">\n%s\n</style>\n</head>\n<body>\n%s\n</body>\n</html>"
doc <- sprintf(
template,
sprintf("Corpus: %s", corpus), # title
css,
markdown::renderMarkdown(text = object)
)
if (!is.null(height)){
fmt <- '%s<div style="border: 1px solid #ddd; padding: 5px; overflow-y: scroll; height: %s;">%s</div></body></html>'
doc <- sprintf(
fmt,
gsub("^(.*?)<body>\n.*?</body>.*?$", "\\1", doc),
height,
gsub("^.*?<body>\n(.*?)</body>.*?$", "\\1", doc)
)
doc <- gsub("<h3>", '<h3 class="fulltext">', doc)
}
doc
})
#' @importFrom xml2 read_html xml_find_all xml_text xml_parent xml_attr
#' @importFrom xml2 xml_attrs xml_remove xml_attr<- xml_name
#' @importFrom stringi stri_extract_all_boundaries
.addCharacterOffset <- function(x){
# check that required dependencies are present
if (!requireNamespace("xml2", quietly = TRUE))
stop("package 'xml2' not installed (please install)")
if (!requireNamespace("htmltools", quietly = TRUE))
stop("package 'htmltools' not installed (please install")
doc <- read_html(x)
textnodes <- xml_find_all(doc, xpath = "//body//text()")
nodetext <- sapply(textnodes, xml_text)
nchar_textnodes <- sapply(nodetext, nchar)
no_linesplit <- sapply(stri_extract_all_boundaries(nodetext), function(x) length(grep("\\n", x)))
nchar_textnodes <- nchar_textnodes - ifelse(no_linesplit > 0, 1, 0)
offset <- c(0L, cumsum(nchar_textnodes)[1L:(length(nchar_textnodes) - 1L)])
dummy <- lapply(
1L:length(textnodes),
function(i){
if (!grepl("^\\s*$", nodetext[i])){
parent <- xml2::xml_parent(textnodes[[i]])
if (xml_name(parent) == "span"){
xml_attr(parent, "left") <- as.character(offset[i])
xml_attr(parent, "right") <- offset[i] + nchar(nodetext[i])
}
}
return(NULL)
}
)
return( as.character(doc) )
}
#' @importFrom xml2 xml_find_first xml_text<-
.beautify <- function(x){
doc <- xml2::read_html(x)
lapply(
c(",", ".", ":", ";", "!", "?"),
function(interpunctuation){
nodes <- xml2::xml_find_all(
doc,
xpath = sprintf("//span[starts-with(@token, '%s')]", interpunctuation)
)
if (length(nodes) >= 1){
lapply(
1L:length(nodes),
function(j){
precedingTextNode <- xml_find_first(
nodes[[j]],
xpath = "./preceding-sibling::text()[1]"
)
if (!is.na(xml_text(precedingTextNode))){
if (xml_text(precedingTextNode) == " ") xml_text(precedingTextNode) <- ""
}
}
)
}
}
)
# remove continuing double quotes
nodes <- xml_find_all(doc, xpath = sprintf('//span[@token = "%s"]', "''"))
if (length(nodes) >= 2){
cpos <- as.integer(sapply(lapply(nodes, xml_attrs), function(x) x["id"]))
for (i in 2L:length(cpos)){
if (cpos[i - 1L] + 1L == cpos[i]){
node_to_remove <- xml_find_first(doc, xpath = sprintf('//span[@id = "%d"]', cpos[i]))
if (length(node_to_remove) > 0) xml_remove(node_to_remove)
}
}
}
as.character(doc)
}
#' @rdname html-method
setMethod(
"html", "partition",
function(object, meta = NULL, cpos = TRUE, verbose = FALSE, cutoff = NULL, charoffset = FALSE, beautify = TRUE, height = NULL, ...){
newobj <- if (is.null(get_type(object))){
"subcorpus"
} else {
switch( get_type(object), "plpr" = "plpr_subcorpus", "press" = "press_subcorpus" )
}
html(
object = as(object, newobj),
meta = meta,
cpos = cpos,
verbose = verbose,
cutoff = cutoff,
charoffset = charoffset,
beautify = beautify,
height = height,
...
)
})
#' @rdname html-method
#' @exportMethod html
#' @docType methods
setMethod(
"html", "subcorpus",
function(object, meta = NULL, cpos = TRUE, verbose = FALSE, cutoff = NULL, charoffset = FALSE, beautify = FALSE, height = NULL, ...){
if (!requireNamespace("markdown", quietly = TRUE) && requireNamespace("htmltools", quietly = TRUE)){
stop("package 'markdown' is not installed, but necessary for this function")
}
# code to get meta (almost) identical with read,subcorpus()
if (is.null(meta)){
template_meta <- get_template(object)[["metadata"]]
meta <- if (is.null(template_meta))
names(object@s_attributes)
else
template_meta
}
if (isFALSE(all(meta %in% s_attributes(object))))
warning("not all s-attributes provided as meta are available")
.message("generating markdown", verbose = verbose)
md <- as.markdown(object, meta = meta, cpos = cpos, cutoff = cutoff, ...)
.message("markdown to html", verbose = verbose)
# the css for tooltips is included in the html document by default,
# even if this causes some (minimal) overhead.
# The default stylesheet (markdown.css) needs to be included explicitly,
# so that it is not lost.
doc <- html(object = md, corpus = object@corpus, height = height)
if (beautify) doc <- .beautify(doc)
if (charoffset) doc <- .addCharacterOffset(doc)
ret <- htmltools::HTML(doc)
attr(ret, "browsable_html") <- TRUE
ret
}
)
#' @docType methods
#' @rdname html-method
setMethod("html", "partition_bundle", function(object, charoffset = FALSE, beautify = TRUE, height = NULL, progress = TRUE, ...){
corpus_id <- get_corpus(object)
if (!requireNamespace("markdown", quietly = TRUE))
stop("package 'markdown' is not installed, but necessary for this function")
md_list <- if (isTRUE(progress)){
pblapply(object@objects, function(p) as.markdown(p, ...))
} else {
lapply(object@objects, function(p) as.markdown(p, ...))
}
md <- paste(md_list, collapse = "\n* * *\n")
md <- paste(
paste('## Excerpt from corpus', corpus_id, '\n* * *\n'),
md,
'\n* * *\n',
collapse = "\n"
)
doc <- html(object = md, corpus = corpus_id, height = height)
if (beautify) doc <- .beautify(doc)
if (charoffset) doc <- .addCharacterOffset(doc)
ret <- htmltools::HTML(doc)
attr(ret, "browsable_html") <- TRUE
ret
})
#' @rdname html-method
setMethod("html", "kwic", function(object, i, s_attribute = NULL, type = NULL, verbose = FALSE){
# getting metadata for all kwic lines is potentially not the fastes solution ...
if (!is.null(s_attribute)){
if (!s_attribute %in% s_attributes(object@corpus))
stop("s-attribute provided is not available")
s_attrs <- s_attribute
object <- enrich(object, s_attributes = s_attrs)
} else if (length(object@metadata) == 0L){
s_attrs <- get_template(object)[["metadata"]]
if (is.null(s_attrs)){
stop(
"The html()-method for kwic objects requires ",
"an explicit statement of an s-attribute in the method call, ",
"an implicit declaration via the kwic metadata ",
"or a statement in a template. Requirement not met."
)
} else {
.message("using metadata from template: ", paste(s_attrs, collapse = " / "), verbose = verbose)
if (length(s_attrs) > 0L){
.message("enriching", verbose = verbose)
object <- enrich(object, meta = s_attrs)
}
}
} else {
s_attrs <- object@metadata
}
partition_to_read <- partition(
object@corpus,
def = lapply(setNames(s_attrs, s_attrs), function(x) object@stat[[x]][i]),
type = type
)
.message("generating html", verbose = verbose)
fulltext <- polmineR::html(partition_to_read, meta = s_attrs, cpos = TRUE)
.message("generating highlights", verbose = verbose)
tabSubset <- object@cpos[which(object@cpos[["match_id"]] == i)]
cposContext <- tabSubset[which(tabSubset[["position"]] != 0)][["cpos"]]
cposNode <- tabSubset[which(tabSubset[["position"]] == 0)][["cpos"]]
fulltext <- highlight(
fulltext,
highlight = list(yellow = cposContext, lightgreen = cposNode)
)
fulltext
})
#' @rdname html-method
setMethod("html", "remote_subcorpus", function(object, ...){
ocpu_exec(fn = "html", corpus = object@corpus, server = object@server, restricted = object@restricted, object = as(object, "subcorpus"), ...)
})
#' @include polmineR.R
NULL
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.