Nothing
#' @include partition.R partition_bundle.R context.R S4classes.R
#' @include features.R
NULL
#' Enrich an object.
#'
#' Methods to enrich objects with additional (statistical) information. The
#' methods are documented with the classes to which they adhere. See the
#' references in the \code{seealso}-section.
#' @param .Object a `partition`, `partition_bundle` or comp object
#' @param ... further parameters
#' @aliases enrich enrich-method
#' @docType methods
#' @rdname enrich-method
#' @seealso The enrich method is defined for the following classes:
#' "partition", (see \code{\link{partition_class}}),
#' "partition_bundle" (see \code{\link{partition_bundle-class}}),
#' "kwic" (see \code{\link{kwic-class}}), and
#' "context" (see \code{\link{context-class}}). See the linked documentation
#' to learn how the enrich method can be applied to respective objects.
setGeneric("enrich", function(.Object, ...){standardGeneric("enrich")})
#' @details The \code{enrich}-method will add a count of tokens defined by argument
#' \code{p_attribute} to slot \code{stat} of the \code{partition} object.
#' @param mc \code{logical} or, if numeric, providing the number of cores
#' @param decode \code{logical} value, whether to decode token ids into strings when performing count
#' @exportMethod enrich
#' @docType methods
#' @rdname partition_class
setMethod("enrich", "partition", function(.Object, p_attribute = NULL, decode = TRUE, verbose = TRUE, mc = FALSE, ...){
if ("pAttribute" %in% names(list(...))) p_attribute <- list(...)[["pAttribute"]]
if (!is.null(p_attribute)) {
stopifnot(
is.character(p_attribute),
length(p_attribute) <= 2,
all(p_attribute %in% p_attributes(.Object))
)
.message('getting counts for p-attribute(s):', paste(p_attribute, collapse = ", "), verbose = verbose)
.Object@stat <- count(.Object = .Object, p_attribute = p_attribute, decode = decode, mc = mc, verbose = verbose)@stat
.Object@p_attribute <- p_attribute
}
.Object
})
#' @details The `enrich()` method will fill the slot `stat` of the `partition`
#' objects within the bundle with a count for the designated p-attributes. If
#' `.Object` is a `subcorpus_bundle`, the output class will be a
#' `partition_bundle`.
#' @param p_attribute A `character` vector with p-attribute(s) for counting.
#' @param decode A `logical` value, whether to turn token ids into decoded
#' strings.
#' @param progress A `logical` value, whether to show progress bar.
#' @param verbose A `logical` value, whether to show progress messages.
#' @exportMethod enrich
#' @docType methods
#' @rdname partition_bundle-class
#' @importFrom cli cli_process_start cli_process_done col_blue
setMethod("enrich", "partition_bundle", function(.Object, p_attribute, decode = TRUE, verbose = FALSE){
m <- do.call(rbind, lapply(.Object@objects, slot, name = "cpos"))
ids <- lapply(
p_attribute,
function(p_attr){
if (verbose)
cli_process_start(
sprintf("get ids for p-attribute %s", col_blue(p_attr))
)
y <- RcppCWB::region_matrix_to_ids(
corpus = .Object@corpus, registry = .Object@registry_dir,
p_attribute = p_attr, matrix = m
)
if (verbose) cli_process_done()
y
}
)
names(ids) <- paste(p_attribute, "id", sep = "_")
if (verbose) cli_process_start("create temporary table with doc ids")
dt <- setDT(ids)
doc_id <- unlist(mapply(
rep,
x = 1L:length(.Object@objects),
times = sapply(.Object@objects, slot, name = "size")
))
dt[, "doc_id" := doc_id]
if (verbose) cli_process_done()
if (verbose) cli_process_start("perform count")
cnt <- dt[, .N, by = c("doc_id", paste(p_attribute, "id", sep = "_"))]
rm(dt)
setnames(cnt, old = "N", new = "count")
if (verbose) cli_process_done()
if (decode){
for (p_attr in p_attribute){
if (verbose)
cli_process_start(sprintf("decode p-attribute %s", col_blue(p_attr)))
str <- id2str(
x = .Object,
p_attribute = p_attr,
id = cnt[[paste(p_attr, "id", sep = "_")]]
)
cnt[, (p_attr) := as.nativeEnc(x = str, from = .Object@encoding)]
if (verbose) cli_process_done()
}
}
setcolorder(
cnt,
neworder = c(p_attribute, paste(p_attribute, "id", sep = "_"))
)
if (verbose) cli_process_start("split into tables")
cnt_list <- split(x = cnt, by = "doc_id", keep.by = FALSE)
rm(cnt)
if (verbose) cli_process_done()
if (verbose) cli_process_start("assign count tables to input object")
.Object@objects <- mapply(
function(a, b){a@stat <- b; a@p_attribute <- p_attribute; a},
.Object@objects,
cnt_list
)
if (verbose) cli_process_done()
.Object
})
#' @rdname partition_bundle-class
#' @exportMethod enrich
setMethod("enrich", "subcorpus_bundle", function(.Object, p_attribute, decode = TRUE, verbose = FALSE){
cli_alert_info("coercion to `partition_bundle` class")
# coercion ensures that objects in slot objects inherit from partition
enrich(
as(.Object, "partition_bundle"),
p_attribute = p_attribute,
decode = decode,
verbose = verbose
)
})
#' @details The \code{enrich} method is used to generate the actual output for
#' the kwic method. If param \code{table} is \code{TRUE}, corpus positions
#' will be turned into a \code{data.frame} with the concordance lines. If
#' param \code{s_attributes} is a character vector with s-attributes, the
#' respective s-attributes will be added as columns to the table with
#' concordance lines.
#' @rdname kwic-class
#' @examples
#' # enrich kwic object
#' i <- corpus("GERMAPARLMINI") %>%
#' kwic(query = "Integration") %>%
#' enrich(s_attributes = c("date", "speaker", "party"))
setMethod("enrich", "kwic", function(.Object, s_attributes = NULL, extra = NULL, table = FALSE, ...){
if ("meta" %in% names(list(...))) s_attributes <- list(...)[["meta"]]
if (!is.null(extra)){
table <- TRUE # it will be necessary to regenerate the table
stopifnot(is.integer(extra) || is.numeric(extra))
if (is.numeric(extra)) extra <- as.integer(extra)
.fn_left <- function(.SD){
cpos_min <- min(.SD[["cpos"]])
position_min <- min(.SD[["position"]])
hit <- .SD[["match_id"]][1]
list(
cpos = (cpos_min - extra):(cpos_min - 1L),
position = (position_min - extra):(position_min - 1L),
direction = -2L
)
}
.fn_right <- function(.SD){
cpos_max <- max(.SD[["cpos"]])
position_max <- max(.SD[["position"]])
hit <- .SD[["match_id"]][1]
list(
cpos = (cpos_max + 1L):(cpos_max + extra),
position = (position_max + 1L):(position_max + extra),
direction = 2L
)
}
dt_left <- .Object@cpos[, .fn_left(.SD), by = "match_id", .SDcols = 1L:ncol(.Object@cpos)]
dt_right <- .Object@cpos[, .fn_right(.SD), by = "match_id", .SDcols = 1L:ncol(.Object@cpos)]
dt <- rbindlist(list(.Object@cpos, dt_left, dt_right), use.names = TRUE, fill = TRUE)
setkeyv(x = dt, cols = c("match_id", "cpos"))
setorderv(x = dt, cols = "cpos")
corpus_size <- RcppCWB::cl_attribute_size(
corpus = .Object@corpus, registry = .Object@registry_dir,
attribute = p_attributes(.Object), attribute_type = "p"
)
.Object@cpos <- dt[cpos >= 0L][cpos <= (corpus_size - 1L)]
token_id <- paste(p_attributes(.Object), "id", sep = "_")
word_id_na <- is.na(.Object@cpos[[token_id]])
word_id_na_index <- which(word_id_na)
ids_na <- cpos2id(
.Object, p_attribute = p_attributes(.Object),
cpos = .Object@cpos[["cpos"]][word_id_na]
)
str_na <- RcppCWB::cl_id2str(
corpus = .Object@corpus, registry = .Object@registry_dir,
p_attribute = p_attributes(.Object), id = ids_na
)
str_na <- as.nativeEnc(str_na, from = .Object@encoding)
.Object@cpos[word_id_na_index, (token_id) := ids_na]
.Object@cpos[word_id_na_index, (p_attributes(.Object)) := str_na]
}
if (table){
if (nrow(.Object@cpos) > 0L){
.fn <- function(.SD) paste(.SD[[.Object@p_attribute[1]]], collapse = " ")
table_ext <- .Object@cpos[, .fn(.SD), by = c("match_id", "direction"), with = TRUE]
.Object@stat <- dcast(data = table_ext, formula = match_id ~ direction, value.var = "V1")
setnames(.Object@stat, old = "0", new = "node")
# columns are renamed one at a time to cover the special case when either the
# left or the right context are (deliberately) empty
if ("-2" %in% colnames(.Object@stat)) setnames(.Object@stat, old = "-2", new = "left_extra")
if ("-1" %in% colnames(.Object@stat)) setnames(.Object@stat, old = "-1", new = "left")
if ("1" %in% colnames(.Object@stat)) setnames(.Object@stat, old = "1", new = "right")
if ("2" %in% colnames(.Object@stat)) setnames(.Object@stat, old = "2", new = "right_extra")
} else {
.Object@stat <- data.table(match_id = integer(), left = character(), node = character(), right = character())
}
}
if (length(s_attributes) > 0L){
.Object@metadata <- unique(c(s_attributes, .Object@metadata))
for (s_attr in .Object@metadata){
if (!s_attr %in% colnames(.Object@stat)){
cpos_to_get <- .Object@cpos[which(.Object@cpos[["position"]] == 0)][, .SD[1], by = "match_id", with = TRUE][["cpos"]]
strucs <- cl_cpos2struc(
corpus = .Object@corpus, registry = .Object@registry_dir,
s_attribute = s_attr, cpos = cpos_to_get
)
strucs_invalid <- which(strucs < 0L)
if (length(strucs_invalid) > 0L) strucs[strucs_invalid] <- 0L
struc_values <- cl_struc2str(
corpus = .Object@corpus, registry = .Object@registry_dir,
s_attribute = s_attr, struc = strucs
)
if (length(strucs_invalid) > 0L) struc_values[strucs_invalid] <- ""
.Object@stat[, (s_attr) := as.nativeEnc(struc_values, from = .Object@encoding)]
}
}
setcolorder(x = .Object@stat, neworder = c(
.Object@metadata,
if ("left_extra" %in% colnames(.Object@stat)) "left_extra" else NULL,
if ("left" %in% colnames(.Object@stat)) "left" else NULL,
"node",
if ("right" %in% colnames(.Object@stat)) "right" else NULL,
if ("right_extra" %in% colnames(.Object@stat)) "right_extra" else NULL
)
)
}
.Object
})
#' @details The `enrich()`-method can be used to add additional information to
#' the `data.table` in the `cpos`-slot of a `context`-object.
#'
#' @exportMethod enrich
#' @docType methods
#' @rdname context-class
#' @param s_attribute The s-attribute(s) to add to `data.table` in slot `cpos`.
#' @param p_attribute The p-attribute(s) to add to `data.table` in slot `cpos`.
#' @param decode A `logical` value, whether to convert integer ids to expressive
#' strings.
#' @param stat A `logical` value, whether to generate / update slot `stat` from
#' the `cpos` table.
#' @param verbose A `logical`, whether to be talkative.
#' @importFrom RcppCWB corpus_p_attributes
setMethod("enrich", "context", function(.Object, s_attribute = NULL, p_attribute = NULL, decode = FALSE, stat = FALSE, verbose = TRUE, ...){
if ("pAttribute" %in% names(list(...))){
lifecycle::deprecate_warn(
when = "0.8.7",
what = "enrich(pAttribute)",
with = "enrich(p_attribute)"
)
p_attribute <- list(...)[["pAttribute"]]
}
if ("sAttribute" %in% names(list(...))){
lifecycle::deprecate_warn(
when = "0.8.7",
what = "enrich(sAttribute)",
with = "enrich(s_attribute)"
)
s_attribute <- list(...)[["pAttribute"]]
}
if (!is.null(s_attribute)){
# check that all s-attributes are available
.message("checking that all s-attributes are available", verbose = verbose)
stopifnot(
all(s_attribute %in% corpus_s_attributes(corpus = .Object@corpus, registry = .Object@registry_dir))
)
for (s_attr in s_attribute){
.message("get struc for s-attribute:", s_attr, verbose = verbose)
strucs <- cl_cpos2struc(
corpus = .Object@corpus, registry = .Object@registry_dir,
s_attribute = s_attr, cpos = .Object@cpos[["cpos"]]
)
if (decode == FALSE){
colname_struc <- paste(s_attr, "int", sep = "_")
if (colname_struc %in% colnames(.Object@cpos)){
.message("already present, skipping assignment of column:", colname_struc, verbose = verbose)
} else {
.Object@cpos[, (colname_struc) := strucs]
}
} else {
if (s_attr %in% colnames(.Object@cpos)){
.message("already present, skipping assignment of column:", s_attr, verbose = verbose)
} else {
.message("get string for s-attribute:", s_attr, verbose = verbose)
strings <- cl_struc2str(
corpus = .Object@corpus, registry = .Object@registry_dir,
s_attribute = s_attr, struc = strucs
)
.Object@cpos[, (s_attr) := as.nativeEnc(strings, from = .Object@encoding)]
}
}
}
}
if (!is.null(p_attribute)){
# check that all p-attributes are available
.message("checking that all p-attributes are available", verbose = verbose)
stopifnot(
all(p_attribute %in% corpus_p_attributes(.Object@corpus, registry = .Object@registry_dir))
)
# add ids and decode if requested
for (p_attr in p_attribute){
colname <- paste(p_attr, "id", sep = "_")
if (colname %in% colnames(.Object@cpos)){
.message("already present - skip getting ids for p-attribute:", p_attr, verbose = verbose)
} else {
.message("getting token id for p-attribute:", p_attr, verbose = verbose)
ids <- cpos2id(
x = .Object, p_attribute = p_attr, cpos = .Object@cpos[["cpos"]]
)
.Object@cpos[, (colname) := ids]
}
if (decode){
if (p_attr %in% colnames(.Object@cpos)){
.message("already present - skip getting strings for p-attribute:", p_attr, verbose = verbose)
} else {
.message("decode p-attribute:", p_attr, verbose = verbose)
p_attr_id <- paste(p_attr, "id", sep = "_")
decoded <- id2str(
x = .Object, p_attribute = p_attr, id = .Object@cpos[[p_attr_id]]
)
native <- as.nativeEnc(decoded, from = .Object@encoding)
.Object@cpos <- .Object@cpos[, "word" := native]
# .Object@cpos[, (p_attr_id) := NULL]
}
}
}
}
if (isTRUE(stat)){
msg <- sprintf(
"%s count statistics for slot cpos",
if (nrow(.Object@cpos) == 0L) "generate" else "update"
)
.message(msg, verbose = verbose)
p_attr_id <- paste(.Object@p_attribute, "id", sep = "_")
setkeyv(.Object@cpos, p_attr_id)
cpos_min <- .Object@cpos[which(.Object@cpos[["position"]] != 0)]
.Object@stat <- cpos_min[, .N, by = eval(p_attr_id), with = TRUE]
setnames(.Object@stat, "N", "count_coi")
for (i in seq_along(.Object@p_attribute)){
new_col <- id2str(
x = .Object,
p_attribute = .Object@p_attribute[i],
id = .Object@stat[[p_attr_id[i]]]
)
new_col_native <- as.nativeEnc(new_col, from = .Object@encoding)
.Object@stat[, eval(.Object@p_attribute[i]) := new_col_native]
}
}
.Object
})
#' @export
#' @rdname all-cooccurrences-class
#' @details The `enrich()`-method will add columns 'a_count' and 'b_count' to
#' the `data.table` in the 'stat' slot of the `Cooccurrences` object. If the
#' count for the subcorpus/partition from which the cooccurrences are derived
#' is not yet present, the count is performed first.
setMethod("enrich", "Cooccurrences", function(.Object){
cnt <- if (nrow(.Object@partition@stat) > 0L){
.Object@partition@stat
} else {
count(.Object@partition, p_attribute = .Object@p_attribute, decode = FALSE)@stat
}
setkeyv(cnt, paste(.Object@p_attribute, "id", sep = "_"))
setkeyv(.Object@stat, cols = "a_id")
.Object@stat[, "a_count" := cnt[.Object@stat][["count"]] ]
setkeyv(.Object@stat, cols = "b_id")
.Object@stat[, "b_count" := cnt[.Object@stat][["count"]] ]
invisible(.Object)
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.