Nothing
#' @include partition.R S4classes.R
NULL
#' Get terms in `partition` or `corpus`.
#'
#' @param x A `corpus`, `partition` or `subcorpus` object, or a length-one
#' `character` with a corpus id.
#' @param p_attribute The p-attribute to for which to retrieve results
#' (length-one `character` vector).
#' @param regex Regular expression(s) to filter results (`character` vector).
#' @param robust A `logical` value, whether to check for potential failures.
#' @exportMethod terms
#' @docType methods
#' @name terms
#' @docType methods
#' @importFrom RcppCWB region_matrix_to_ids
#' @examples
#' use(pkg = "RcppCWB", corpus = "REUTERS")
#'
#' r <- partition("REUTERS", id = "144")
#' words <- terms(r, "word")
#' terms(r, p_attribute = "word", regex = ".*il.*")
#' @rdname terms
#' @aliases terms,slice-method terms,partition-method
setMethod("terms", "partition", function(x, p_attribute, regex = NULL){
# ensure that input is correct
stopifnot(is.character(p_attribute))
if (length(p_attribute) > 1) stop("cannot process more than one p-attribute")
if (!is.null(regex)) stopifnot(is.character(regex))
# if count has been performed for partition use stat table
if (identical(p_attribute, x@p_attribute)){
y <- x@stat[[p_attribute]]
} else {
ids <- region_matrix_to_ids(
corpus = x@corpus, registry = x@registry_dir,
p_attribute = p_attribute,
matrix = x@cpos
)
ids_unique <- unique(ids)
y <- cl_id2str(
corpus = x@corpus,
p_attribute = p_attribute,
id = ids_unique,
registry = x@registry_dir
)
Encoding(y) <- x@encoding
}
y <- enc2utf8(y)
if (!is.null(regex)) {
y <- unlist(lapply(regex, function(r) grep(r, y, value = TRUE)))
}
y
})
#' @rdname terms
setMethod("terms", "subcorpus", function(x, p_attribute, regex = NULL){
terms(x = as(x, "partition"), p_attribute = p_attribute, regex = regex)
})
#' @rdname terms
setMethod("terms", "corpus", function(x, p_attribute, regex = NULL, robust = FALSE){
stopifnot(
length(x) == 1L,
is.character(p_attribute),
length(p_attribute) == 1L,
is.logical(robust)
)
if (!is.null(regex)) stopifnot(is.character(regex))
terms_total <- cl_lexicon_size(
corpus = x@corpus,
p_attribute = p_attribute,
registry = x@registry_dir
)
ids <- 0L:(terms_total - 1L)
str <- cl_id2str(
corpus = x@corpus,
p_attribute = p_attribute,
id = ids,
registry = x@registry_dir
)
Encoding(str) <- x@encoding
y <- as.nativeEnc(str, from = x@encoding)
if (robust != FALSE){
if (robust){
if (length(y) != length(unique(y))){
warning("there may be terms causing issues")
strCount <- table(y)
villains <- names(which(strCount > 1))
}
} else if (is.character(robust)) {
villains <- robust
}
for (villain in villains){
warning("this is a villain: ", villain)
villain_pos <- which(villain == y)
for (i in 1L:length(villain_pos)){
if (i >= 2L) y[villain_pos[i]] <- paste(villain, i, sep = "_")
}
}
}
if (!is.null(regex)) {
y <- unlist(lapply(regex, function(r) grep(r, y, value = TRUE)))
}
y
})
#' @rdname terms
setMethod("terms", "character", function(x, p_attribute, regex = NULL, robust = FALSE){
terms(
x = corpus(x),
p_attribute = p_attribute,
regex = regex,
robust = robust
)
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.