Nothing
#' @name decode-method
#' @aliases decode
#' @rdname decode
setGeneric("decode", function(.Object, ...) standardGeneric("decode"))
setOldClass("Annotation")
setOldClass("AnnotatedPlainTextDocument")
setAs(from = "corpus", to = "Annotation", def = function(from){
# Implemented only for class 'corpus', 'subcorpus'-class will inherit from it.
if (!requireNamespace(package = "NLP", quietly = TRUE))
stop("Package 'NLP' required but not available")
word <- get_token_stream(
from@cpos,
corpus = from@corpus,
p_attribute = "word",
encoding = from@encoding
)
if (!"pos" %in% p_attributes(from)) stop("p-attribute not available")
pos <- get_token_stream(
from@cpos,
corpus = from@corpus,
p_attribute = "pos",
encoding = from@encoding
)
ws_after <- c(ifelse(pos %in% c("$.", "$,"), FALSE, TRUE)[-1], FALSE)
word_with_ws <- paste(word, ifelse(ws_after, " ", ""), sep = "")
s <- paste(word_with_ws, collapse = "")
word_length <- sapply(word, nchar)
left_offset <- c(
1L,
(cumsum(sapply(word_with_ws, nchar)) + 1L)[1L:(length(word) - 1L)]
)
names(left_offset) <- word
right_offset <- left_offset + word_length - 1L
names(right_offset) <- word
cpos <- ranges_to_cpos(from@cpos)
w <- NLP::Annotation(
id = cpos,
rep.int("word", length(cpos)),
start = left_offset,
end = right_offset
)
right_offset <- left_offset + word_length
names(right_offset) <- word # repeats
m <- matrix(data = c(left_offset, right_offset), ncol = 2, byrow = FALSE)
f <- cut(
x = 1L:length(pos),
breaks = unique(c(1L, grep("\\$\\.", pos), length(pos))),
include.lowest = TRUE
)
chunks <- split(x = m, f = f)
sentence_left <- sapply(chunks, min)
sentence_right <- sapply(chunks, max) - 1L
s <- NLP::Annotation(
id = 1L:length(sentence_left),
rep.int("sentence", length(sentence_left)),
start = sentence_left,
end = sentence_right
)
c(w, s)
})
#' @importFrom RcppCWB region_matrix_to_struc_matrix
as.AnnotatedPlainTextDocument <- function(x, p_attributes = NULL, s_attributes = NULL, mw = NULL, stoplist = NULL, verbose = TRUE){
# is required to be a subcorpus object
stopifnot(
inherits(x, "subcorpus"),
length(mw) == length(unique(mw))
)
if (!requireNamespace(package = "NLP", quietly = TRUE))
stop("Package 'NLP' required but not available.")
if (verbose) cli_alert_info("decode p-attributes")
p_attrs <- if (is.null(p_attributes)) p_attributes(x) else p_attributes
ts <- decode(
x,
to = "data.table",
p_attribute = p_attrs,
s_attributes = character(),
verbose = verbose
)
if (!is.null(stoplist)) ts <- ts[!ts[["word"]] %in% stoplist]
if (verbose) cli_progress_step("generate plain text string")
ws_after <- c(!grepl("^[.,;:!?]$", ts[["word"]])[2L:nrow(ts)], FALSE)
word_with_ws <- paste(ts[["word"]], ifelse(ws_after, " ", ""), sep = "")
s <- stringi::stri_c(word_with_ws, collapse = "")
if (verbose) cli_progress_step("generate token-level annotation")
left_offset <- c(1L, (cumsum(nchar(word_with_ws)) + 1L)[1L:(nrow(ts) - 1L)])
right_offset <- left_offset + nchar(ts[["word"]]) - 1L
w <- NLP::Annotation(
id = ts[["cpos"]],
rep.int("word", nrow(ts)),
start = left_offset,
end = right_offset,
features = lapply(
split(ts[, p_attrs, with = FALSE], f = 1L:nrow(ts)),
as.data.frame
)
)
mw_annotations <- lapply(
mw,
function(s_attr){
if (verbose)
cli_progress_step(
"generate annotation for s-attribute {.val {s_attr}}"
)
struc_matrix <- RcppCWB::region_matrix_to_struc_matrix(
corpus = x@corpus,
registry = x@registry_dir,
region_matrix = x@cpos,
s_attribute = s_attr
)
strucs <- ranges_to_cpos(struc_matrix)
strucs_min <- unique(strucs[strucs >= 0L])
if (length(strucs_min) == 0L) return(NULL)
regions <- get_region_matrix(
corpus = x@corpus,
s_attribute = s_attr,
strucs = strucs_min,
registry = x@registry_dir
)
str <- RcppCWB::cl_struc2str(
corpus = x@corpus,
registry = x@registry_dir,
s_attribute = s_attr,
struc = strucs_min
)
Encoding(str) <- x@encoding
str <- as.nativeEnc(x = str, from = x@encoding)
data <- lapply(
1:nrow(regions),
function(i){
matching <- which(w$id %in% regions[i,1]:regions[i,2])
# If the region covers tokens that have been dropped due to the
# stoplist, no matches - omit these cases.
if (length(matching) > 0L){
txt <- stri_sub(
str = s,
from = w[min(matching)]$start,
to = w[max(matching)]$end
)
list(
kind = str[i],
text = txt,
constituents = regions[i,1]:regions[i,2],
start = w[min(matching)]$start,
end = w[max(matching)]$end
)
} else {
NULL
}
}
)
nomatch <- sapply(data, length) == 0
if (any(nomatch)) data <- data[-which(nomatch)]
NLP::Annotation(
id = seq_along(data),
type = rep.int(
if (is.null(names(mw))) s_attr else names(mw)[which(mw == s_attr)],
length(data)
),
start = unname(unlist(lapply(data, `[[`, "start"))),
end = unname(unlist(lapply(data, `[[`, "end"))),
features = lapply(data, `[`, c("kind", "text", "constituents"))
)
}
)
if (verbose) cli_progress_step("retrieve metadata")
meta <- lapply(
setNames(s_attributes, s_attributes),
function(s_attr){
struc <- cl_cpos2struc(
corpus = x@corpus,
registry = x@registry_dir,
s_attribute = s_attr,
cpos = x@cpos[1,1]
)
value <- cl_struc2str(
corpus = x@corpus,
registry = x@registry_dir,
s_attribute = s_attr,
struc = struc
)
Encoding(value) <- x@encoding
as.nativeEnc(x = value, from = x@encoding)
}
)
if (verbose) cli_progress_step("instantiate AnnotatedPlainTextDocument")
NLP::AnnotatedPlainTextDocument(
s = s,
a = c(w, do.call(c, mw_annotations)),
meta = meta
)
}
#' Decode corpus or subcorpus.
#'
#' Decode `corpus` or `subcorpus` and return class specified by argument `to`.
#'
#' The primary purpose of the method is type conversion. By obtaining the corpus
#' or subcorpus in the format specified by the argument `to`, the data can be
#' processed with tools that do not rely on the Corpus Workbench (CWB).
#' Supported output formats are `data.table` (which can be converted to a
#' `data.frame` or `tibble` easily) or an `Annotation` object as defined in the
#' package 'NLP'. Another purpose of decoding the corpus can be to rework it,
#' and to re-import it into the CWB (e.g. using the 'cwbtools'-package).
#'
#' An earlier version of the method included an option to decode a single
#' s-attribute, which is not supported any more. See the `s_attribute_decode()`
#' function of the package RcppCWB.
#'
#' @return The return value will correspond to the class specified by argument
#' `to`.
#'
#' @param .Object The `corpus` or `subcorpus` to decode.
#' @param to The class of the returned object, stated as a length-one
#' `character` vector.
#' @param s_attributes The structural attributes to decode. If `NULL` (default),
#' all structural attributes will be decoded.
#' @param p_attributes The positional attributes to decode. If `NULL` (default),
#' all positional attributes will be decoded.
#' @param stoplist A `character` vector with terms to be dropped.
#' @param decode A `logical` value, whether to decode token ids and struc ids to
#' character strings. If `FALSE`, the values of columns for p- and
#' s-attributes will be `integer` vectors. If `TRUE` (default), the respective
#' columns are `character` vectors.
#' @param mw A `character` vector with s-attributes with multiword expressions.
#' Used only if `to` is 'AnnotatedPlainTextDocument'.
#' @param ... Further arguments.
#' @param verbose A `logical` value, whether to output progess messages.
#' @exportMethod decode
#' @importFrom RcppCWB get_region_matrix
#' @seealso To decode a structural attribute, you can use the
#' \code{\link{s_attributes}}-method, setting argument \code{unique} as
#' \code{FALSE} and \code{\link[RcppCWB]{s_attribute_decode}}. See
#' \code{\link{as.VCorpus}} to decode a \code{partition_bundle} object,
#' returning a \code{VCorpus} object.
#' @examples
#' use("polmineR")
#' use(pkg = "RcppCWB", corpus = "REUTERS")
#'
#' # Decode corpus as data.table
#' dt <- decode("REUTERS", to = "data.table")
#'
#' # Decode corpus selectively
#' dt <- decode(
#' "REUTERS",
#' to = "data.table",
#' p_attributes = "word",
#' s_attributes = "id"
#' )
#'
#' # Decode a subcorpus
#' dt <- corpus("REUTERS") %>%
#' subset(id %in% c("127", "144")) %>%
#' decode(s_attributes = "id", to = "data.table")
#'
#' # Decode partition
#' dt <- partition("REUTERS", places = "kuwait", regex = TRUE) %>%
#' decode(to = "data.table")
#'
#' # Previous versions of polmineR offered an option to decode a single
#' # s-attribute. This is how you could proceed to get a table with metadata.
#' dt <- partition("REUTERS", places = "kuwait", regex = TRUE) %>%
#' decode(s_attribute = "id", decode = FALSE, to = "data.table")
#' dt[, "word" := NULL]
#' dt[,{list(cpos_left = min(.SD[["cpos"]]), cpos_right = max(.SD[["cpos"]]))}, by = "id"]
#'
#' # Decode subcorpus as Annotation object
#' \dontrun{
#' if (requireNamespace("NLP")){
#' library(NLP)
#' p <- corpus("GERMAPARLMINI") %>%
#' subset(date == "2009-11-10" & speaker == "Angela Dorothea Merkel")
#' s <- as(p, "String")
#' a <- as(p, "Annotation")
#'
#' # The beauty of having this NLP Annotation object is that you can now use
#' # the different annotators of the openNLP package. Here, just a short scenario
#' # how you can have a look at the tokenized words and the sentences.
#'
#' words <- s[a[a$type == "word"]]
#' sentences <- s[a[a$type == "sentence"]] # does not yet work perfectly for plenary protocols
#'
#' doc <- decode(p, to = "AnnotatedPlainTextDocument")
#' }
#' }
#' @rdname decode
#' @importFrom cli cli_progress_step
setMethod("decode", "corpus", function(.Object, to = c("data.table", "Annotation", "AnnotatedPlainTextDocument"), p_attributes = NULL, s_attributes = NULL, mw = NULL, stoplist = NULL, decode = TRUE, verbose = TRUE){
if (length(to) != 1L)
stop("`decode()` - argument 'to' required to have length 1")
if (!to %in% c("data.table", "Annotation", "AnnotatedPlainTextDocument")){
cli_alert_danger(
"Argument 'to' of `decode()` required to be 'data.table' or 'Annotation'."
)
stop()
}
if (to == "data.table"){
if (is.null(p_attributes)) p_attributes <- p_attributes(.Object)
if (!all(p_attributes %in% p_attributes(.Object)))
stop("Not all p_attributes provided are available.")
p_attribute_list <- lapply(
setNames(p_attributes, p_attributes),
function(p_attr){
if (verbose)
cli_progress_step(sprintf("decoding p-attribute: %s", p_attr))
get_token_stream(.Object, p_attribute = p_attr, decode = decode)
}
)
max_cpos <- size(.Object) - 1L
if (is.null(s_attributes)) s_attributes <- s_attributes(.Object)
if (length(s_attributes) > 0L){
if (!all(s_attributes %in% s_attributes(.Object)))
stop("Not all s_attributes provided are available.")
}
if (length(s_attributes) > 0L){
s_attribute_list <- lapply(
setNames(s_attributes, s_attributes),
function(s_attr){
if (verbose)
cli_progress_step(sprintf("decoding s-attribute: %s", s_attr))
struc <- cl_cpos2struc(
corpus = .Object@corpus,
registry = .Object@registry_dir,
s_attribute = s_attr,
cpos = 0L:max_cpos
)
if (decode && s_attr_has_values(s_attr, x = .Object)){
str <- cl_struc2str(
corpus = .Object@corpus,
registry = .Object@registry_dir,
s_attribute = s_attr,
struc = struc
)
Encoding(str) <- encoding(.Object)
return(as.nativeEnc(str, from = encoding(.Object)))
} else {
return(struc)
}
}
)
} else {
s_attribute_list <- list()
}
if (verbose) cli_progress_step("assembling data.table")
combined_list <- c(
list(cpos = 0L:max_cpos),
p_attribute_list,
s_attribute_list
)
y <- data.table::as.data.table(combined_list)
} else if (to == "Annotation"){
y <- as(.Object, "Annotation")
} else if (to == "AnnotatedPlainTextDocument"){
y <- as.AnnotatedPlainTextDocument(
x = as(.Object, "subcorpus"), # fn requires cpos slot
p_attributes = p_attributes,
s_attributes = s_attributes,
mw = mw,
stoplist = stoplist,
verbose = verbose
)
}
y
})
#' @exportMethod decode
#' @rdname decode
setMethod("decode", "character", function(.Object, to = c("data.table", "Annotation"), s_attributes = NULL, p_attributes = NULL, decode = TRUE, verbose = TRUE){
decode(corpus(.Object), to = to, s_attributes = s_attributes, p_attributes = p_attributes, decode = decode, verbose = verbose)
})
#' @exportMethod decode
#' @rdname decode
setMethod("decode", "slice", function(.Object, to = c("data.table", "Annotation", "AnnotatedPlainTextDocument"), s_attributes = NULL, p_attributes = NULL, mw = NULL, stoplist = NULL, decode = TRUE, verbose = TRUE){
if (to == "data.table"){
if (is.null(p_attributes)) p_attributes <- p_attributes(.Object)
if (!all(p_attributes %in% p_attributes(.Object)))
stop("Not all p_attributes provided are available.")
y <- data.table(cpos = ranges_to_cpos(.Object@cpos))
setkeyv(y, cols = "cpos") # col cpos used repeatedly to merge s-attributes
for (p_attr in p_attributes){
if (verbose) cli_progress_step("decoding p_attribute {.val {p_attr}}")
y[, (p_attr) := get_token_stream(.Object, p_attribute = p_attr, decode = decode)]
if (verbose) cli_progress_done()
}
if (is.null(s_attributes)){
if (verbose)
cli_alert_info(
"argument `s_attributes` missing - decode all s-attributes"
)
s_attributes <- s_attributes(.Object)
}
if (length(s_attributes) > 0L){
if (!all(s_attributes %in% s_attributes(.Object)))
stop("Not all s_attributes provided are available.")
for (i in 1L:length(s_attributes)){
if (verbose)
cli_progress_step("decoding s_attribute {.val {s_attributes[i]}}")
# Tested: strucs, regions and corpus positions will be identical if
# i > 2 and current s-attribute is sibling of previous sibling. But
# the cost of learning whether s-attributes are siblings consumes the
# performance gain, so we get strucs, regions and cpos anew.
struc_matrix <- RcppCWB::region_matrix_to_struc_matrix(
corpus = .Object@corpus,
s_attribute = s_attributes[i],
registry = .Object@registry_dir,
region_matrix = .Object@cpos
)
strucs <- RcppCWB::ranges_to_cpos(struc_matrix)
if (length(strucs) == 0L) next
regions <- RcppCWB::get_region_matrix(
corpus = .Object@corpus,
registry = .Object@registry_dir,
s_attribute = s_attributes[i],
strucs = strucs
)
cpos <- RcppCWB::ranges_to_cpos(regions)
decode <- decode && s_attr_has_values(s_attributes[i], x = .Object)
if (decode){
str <- cl_struc2str(
corpus = .Object@corpus,
registry = .Object@registry_dir,
s_attribute = s_attributes[i],
struc = strucs
)
Encoding(str) <- encoding(.Object)
}
values <- data.table(
cpos = cpos,
attr = do.call(
c,
mapply(
rep,
x = if (decode) str else strucs,
times = (regions[,2] - regions[,1]) + 1,
SIMPLIFY = FALSE
)
)
)
setnames(values, old = "attr", new = s_attributes[i])
y <- values[y, on = "cpos"]
if (verbose) cli_progress_done()
}
setcolorder(y, neworder = c("cpos", p_attributes, s_attributes))
}
} else if (to == "Annotation"){
y <- as(.Object, "Annotation")
} else if (to == "AnnotatedPlainTextDocument"){
return(callNextMethod())
} else {
stop("not implemented")
}
y
})
#' @rdname decode
setMethod("decode", "partition", function(.Object, to = "data.table", s_attributes = NULL, p_attributes = NULL, decode = TRUE, verbose = TRUE){
callNextMethod()
})
#' @exportMethod decode
#' @rdname decode
setMethod("decode", "subcorpus", function(.Object, to = c("data.table", "Annotation", "AnnotatedPlainTextDocument"), s_attributes = NULL, p_attributes = NULL, mw = NULL, stoplist = NULL, decode = TRUE, verbose = TRUE){
if (to == "AnnotatedPlainTextDocument"){
return(
as.AnnotatedPlainTextDocument(
x = .Object,
p_attributes = p_attributes,
s_attributes = s_attributes,
mw = mw,
stoplist = stoplist,
verbose = verbose
)
)
}
callNextMethod()
})
#' @details If \code{.Object} is an \code{integer} vector, it is assumed to be a
#' vector of integer ids of p-attributes. The \code{decode}-method will
#' translate token ids to string values as efficiently as possible. The
#' approach taken will depend on the corpus size and the share of the corpus
#' that is to be decoded. To decode a large number of integer ids, it is more
#' efficient to read the lexicon file from the data directory directly and to
#' index the lexicon with the ids rather than relying on
#' \code{RcppCWB::cl_id2str}. The internal decision rule is to use the lexicon
#' file when the corpus is larger than 10 000 000 million tokens and more than
#' 5 percent of the corpus are to be decoded. The encoding of the
#' \code{character} vector that is returned will be the coding of the locale
#' (usually ISO-8859-1 on Windows, and UTF-8 on macOS and Linux machines).
#' @param boost A length-one \code{logical} value, whether to speed up decoding
#' a long vector of token ids by directly by reading in the lexion file from
#' the data directory of a corpus. If \code{NULL} (default), the internal
#' decision rule is that \code{boost} will be \code{TRUE} if the corpus is
#' larger than 10 000 000 million tokens and more than 5 percent of the corpus
#' are to be decoded.
#' @param corpus A CWB indexed corpus, either a length-one \code{character} vector,
#' or a \code{corpus} object.
#' @exportMethod decode
#' @rdname decode
#' @examples
#'
#' # decode vector of token ids
#' y <- decode(0:20, corpus = "GERMAPARLMINI", p_attributes = "word")
#' @importFrom stringi stri_encode
setMethod("decode", "integer", function(.Object, corpus, p_attributes, boost = NULL){
stopifnot(
length(corpus) == 1L,
length(p_attributes) == 1L,
is.character(p_attributes)
)
corpus <- if (is.character(corpus)) corpus(corpus) else corpus
if (!inherits(corpus, "corpus"))
stop("Argument 'corpus' is required to be a corpus object.")
if (is.null(boost)){
boost <- if (corpus@size > 10000000L && length(cpos) >= (corpus@size * 0.05)) TRUE else FALSE
}
if (isTRUE(boost)){
lexfile <- fs::path(corpus@data_dir, sprintf("%s.lexicon", p_attributes))
lexicon <- readBin(con = lexfile, what = character(), n = file.info(lexfile)$size)
Encoding(lexicon) <- corpus@encoding
if (!identical(corpus@encoding, encoding())){
# lexicon <- stringi::stri_encode(lexicon, from = corpus@encoding, to = encoding()) # as.locale
lexicon <- iconv(lexicon, from = corpus@encoding, to = encoding())
Encoding(lexicon) <- encoding()
}
y <- lexicon[.Object + 1L]
} else if (isFALSE(boost)){
y <- RcppCWB::cl_id2str(
corpus = corpus@corpus, registry = corpus@registry_dir,
p_attribute = p_attributes, id = .Object
)
Encoding(y) <- corpus@encoding
if (!identical(corpus@encoding, encoding())){
# y <- stringi::stri_encode(y, from = corpus@encoding, to = encoding())
y <- iconv(y, from = corpus@encoding, to = encoding())
Encoding(y) <- encoding()
}
}
y
})
#' @details The \code{decode}-method for \code{data.table} objects will decode
#' token ids (column '`p-attribute`_id'), adding the corresponding string as a
#' new column. If a column "cpos" with corpus positions is present, ids are
#' derived for the corpus positions given first. If the \code{data.table}
#' neither has a column "cpos" nor columns with token ids (i.e. colummn name
#' ending with "_id"), the input \code{data.table} is returned unchanged. Note
#' that columns are added to the \code{data.table} in an in-place operation to
#' handle memory parsimoniously.
#' @examples
#' dt <- data.table::data.table(cpos = cpos("GERMAPARLMINI", query = "Liebe")[,1])
#' decode(dt, corpus = "GERMAPARLMINI", p_attributes = c("word", "pos"))
#' y <- dt[, .N, by = c("word", "pos")]
#' @rdname decode
setMethod("decode", "data.table", function(.Object, corpus, p_attributes){
corpus <- if (is.character(corpus)) corpus(corpus) else corpus
if (!inherits(corpus, "corpus")) stop("Argument 'corpus' is required to be a corpus object.")
if (isFALSE(all(p_attributes %in% p_attributes(corpus)))){
stop("At least one p-attribute is not available.")
}
if ("cpos" %in% colnames(.Object)){
for (p_attr in p_attributes){
p_attr_id <- paste(p_attr, "id", sep = "_")
ids <- cpos2id(corpus, p_attribute = p_attr, cpos = .Object[["cpos"]])
.Object[, (p_attr_id) := ids]
}
}
for (p_attr in p_attributes){
p_attr_id <- paste(p_attr, "id", sep = "_")
if (p_attr_id %in% colnames(.Object)){
.Object[, (p_attr) := decode(.Object[[p_attr_id]], corpus = corpus, p_attributes = p_attr)]
}
}
.Object
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.