Nothing
#' @include textstat.R features.R S4classes.R
NULL
#' @rdname context-class
setMethod("sample", "context", function(x, size){
hits_unique <- unique(x@cpos[["match_id"]])
if (size > length(hits_unique)){
warning("argument size exceeds number of hits, returning original object")
return(x)
}
x@cpos <- x@cpos[which(x@cpos[["match_id"]] %in% sample(hits_unique, size = size))]
x@count <- as.integer(size)
x@size <- length(which(x@cpos[["position"]] != 0))
x
})
setAs(
from = "context", to = "matrix",
def = function(from){
x <- copy(from@cpos)
setorderv(x, cols = c("cpos", "match_id"))
x[, "direction" := sign(x[["position"]])]
dt <- rbindlist(lapply(
c(-1L, 0L, 1L),
function(dir){
x[x[["direction"]] == dir][, list(
cpos_left = .SD$cpos[1],
cpos_right = .SD$cpos[nrow(.SD)]),
by = "match_id"
]
}
))
as.matrix(setorderv(dt, cols = "cpos_left")[, "match_id" := NULL])
}
)
#' @include partition.R partition_bundle.R
NULL
#' @exportMethod context
#' @docType methods
#' @rdname context-method
setGeneric("context", function(.Object, ...) standardGeneric("context") )
#' Analyze context of a node word.
#'
#' Retrieve the word context of a token, optionally checking for boundaries of a
#' XML region.
#'
#' For formulating the query, CPQ syntax may be used (see
#' examples). Statistical tests available are log-likelihood, t-test, pmi.
#'
#' @param .Object a partition or a partition_bundle object
#' @param query A query, which may by a character vector or a CQP query.
#' @param cqp defaults to is.cqp-function, or provide TRUE/FALSE
#' @param check A `logical` value, whether to check validity of CQP query using
#' `check_cqp_query`.
#' @param p_attribute The p-attribute of the query.
#' @param region An s-attribute, given by a length-one `character` vector.
#' The context of query matches will be expanded to the left and right
#' boundary of the region where the match is located. If arguments `left` and
#' `right` are > 1, the left and right boundaries of the respective number of
#' regions will be identified.
#' @param boundary If provided, a length-one character vector specifying a
#' s-attribute. It will be checked that corpus positions do not extend beyond
#' the region defined by the s-attribute.
#' @param left A single `integer` value defining the number of tokens to the
#' left of the query match to include in the context. Advanced usage: (a) If
#' `left` is a length-one `character` vector stating an s-attribute, the
#' context will be expanded to the (left) boundary of the region where the
#' match occurs. (b) If `left` is a named length-one `integer` vector, this
#' value is the number regions of the structural attribute referred to by the
#' vector's name to the left of the query match that are included in the
#' context.
#' @param right A single `integer` value, a length-one `character` vector or a
#' named length-one `integer` value, with equivalent effects to argument
#' `left`.
#' @param stoplist Exclude match for query if stopword(s) is/are are present in
#' context. See positivelist for further explanation.
#' @param positivelist A `character` vector or `numeric`/`integer` vector:
#' include a query hit only if token in positivelist is present. If
#' positivelist is a `character` vector, it may include regular expressions
#' (see parameter regex).
#' @param regex A `logical` value, defaults to `FALSE` - whether `stoplist`
#' and/or `positivelist` are regular expressions.
#' @param count logical
#' @param mc Whether to use multicore; if `NULL` (default), the function will get
#' the value from the options.
#' @param verbose Report progress? A `logical` value, defaults to `TRUE`.
#' @param progress A `logical` value, whether to show progress bar.
#' @param ... Further parameters.
#' @return depending on whether a `partition` or a `partition_bundle` serves as
#' input, the return will be a context object, or a `context_bundle` object.
#' Note that the number of objects in the `context_bundle` may differ from the
#' number of objects in the input `bundle` object: `NULL` objects that result
#' if no hit is obtained are dropped.
#' @author Andreas Blaette
#' @aliases context,slice-method as.matrix,context_bundle-method context,partition-method
#' @examples
#' use("polmineR")
#' p <- partition("GERMAPARLMINI", interjection = "speech")
#' y <- context(p, query = "Integration", p_attribute = "word")
#' y <- context(p, query = "Integration", p_attribute = "word", positivelist = "Bildung")
#' y <- context(
#' p, query = "Integration", p_attribute = "word",
#' positivelist = c("[aA]rbeit.*", "Ausbildung"), regex = TRUE
#' )
#' @exportMethod context
#' @rdname context-method
#' @name context
#' @docType methods
setMethod("context", "slice", function(
.Object, query, cqp = is.cqp, check = TRUE,
left = getOption("polmineR.left"),
right = getOption("polmineR.right"),
p_attribute = getOption("polmineR.p_attribute"),
region = NULL, boundary = NULL,
stoplist = NULL, positivelist = NULL, regex = FALSE,
count = TRUE,
mc = getOption("polmineR.mc"), verbose = FALSE, progress = TRUE,
...
) {
if (is.numeric(left) && !is.integer(left))
left <- setNames(as.integer(left), names(left))# input may be numeric
if (is.numeric(right) && !is.integer(right))
right <- setNames(as.integer(right), names(left)) # input may be numeric
# get regions for query matches
.message("getting corpus positions", verbose = verbose)
regions <- cpos(
.Object = .Object, query = query, p_attribute = p_attribute[1], cqp = cqp,
check = check
)
if (is.null(regions)){
warning('No hits for query ', query, ' (returning NULL)')
return( invisible(NULL) )
} else {
.message("number of hits:", nrow(regions), verbose = verbose)
}
colnames(regions) <- c("hit_cpos_left", "hit_cpos_right")
ctxt <- context(
.Object = regions,
left = left, right = right,
p_attribute = p_attribute, region = region,
boundary = boundary,
corpus = .Object@corpus,
registry = .Object@registry_dir
)
ctxt@query <- query
ctxt@p_attribute <- p_attribute
ctxt@corpus <- .Object@corpus
ctxt@encoding <- .Object@encoding
ctxt@data_dir <- .Object@data_dir
ctxt@registry_dir <- .Object@registry_dir
ctxt@info_file <- .Object@info_file
ctxt@template <- .Object@template
ctxt@partition <- as(.Object, "partition")
ctxt@size_partition <- as.integer(.Object@size)
ctxt@boundary <- if (!is.null(boundary)) boundary else character()
# add decoded tokens (ids at this stage)
ctxt <- enrich(ctxt, p_attribute = p_attribute, decode = FALSE, verbose = verbose)
# generate positivelist/stoplist with ids and apply it
if (!is.null(positivelist))
ctxt <- trim(ctxt, positivelist = positivelist, regex = regex, verbose = verbose)
if (is.null(ctxt)) return(NULL)
if (!is.null(stoplist))
ctxt <- trim(ctxt, stoplist = stoplist, regex = regex, verbose = verbose)
if (is.null(ctxt)) return(NULL)
.message("generating contexts", verbose = verbose)
ctxt@size <- nrow(ctxt@cpos)
ctxt@size_match <- as.integer(sum(regions[,2] - regions[,1]) + nrow(regions))
ctxt@size_coi <- as.integer(ctxt@size) - ctxt@size_match
ctxt@size_ref <- as.integer(ctxt@size_partition - ctxt@size_coi - ctxt@size_match)
ctxt@size_partition <- size(.Object)
ctxt@count <- length(unique(ctxt@cpos[["match_id"]]))
# put together raw stat table
if (count) ctxt <- enrich(ctxt, stat = TRUE, verbose = verbose)
ctxt
})
#' @rdname context-method
setMethod("context", "partition", function(
.Object, query, cqp = is.cqp, check = TRUE,
left = getOption("polmineR.left"),
right = getOption("polmineR.right"),
p_attribute = getOption("polmineR.p_attribute"),
region = NULL,
boundary = NULL,
stoplist = NULL, positivelist = NULL, regex = FALSE,
count = TRUE,
mc = getOption("polmineR.mc"), verbose = FALSE, progress = TRUE,
...
) callNextMethod()
)
#' @rdname context-method
setMethod("context", "subcorpus", function(
.Object, query, cqp = is.cqp, check = TRUE,
left = getOption("polmineR.left"),
right = getOption("polmineR.right"),
p_attribute = getOption("polmineR.p_attribute"),
region = NULL,
boundary = NULL,
stoplist = NULL, positivelist = NULL, regex = FALSE,
count = TRUE,
mc = getOption("polmineR.mc"), verbose = FALSE, progress = TRUE,
...
) callNextMethod()
)
#' @details If `.Object` is a `matrix`, the `context`-method will call
#' `RcppCWB::region_matrix_context()`, the worker behind the
#' `context()`-method.
#' @param corpus A length-one `character` vector stating a corpus ID.
#' @param registry The registry directory with the registry file for `corpus`.
#' @rdname context-method
#' @importFrom data.table between
#' @importFrom RcppCWB region_matrix_context corpus_s_attributes
setMethod("context", "matrix", function(.Object, corpus, registry = Sys.getenv("CORPUS_REGISTRY"), left, right, p_attribute, region = NULL, boundary = NULL){
if (ncol(.Object) != 2L) stop("context,matrix-method: .Object is required to be a two-column matrix")
stopifnot(is.character(corpus), length(corpus) == 1L)
if (inherits(left, "numeric"))
left <- setNames(as.integer(left), nm = names(left))
if (inherits(right, "numeric"))
right <- setNames(as.integer(right), nm = names(right))
if (is.integer(left) && is.integer(right)){
if (is.null(names(left)) && is.null(names(left))){
if (is.null(region)){
s_attr <- NULL
} else {
s_attrs <- corpus_s_attributes(corpus = corpus, registry = registry)
if (region %in% s_attrs){
s_attr <- region
} else {
warning(sprintf("s-attribute '%s' not defined", region))
s_attr <- NULL
}
}
} else {
s_attr <- unique(c(names(left), names(right)))
if (length(s_attr) > 1L) stop("Only one single s-attribute allowed.")
}
} else if (is.character(left) && is.character(right)){
s_attr <- unique(left, right)
if (length(s_attr) > 1L) stop("Only one single s-attribute allowed.")
left <- 0L
right <- 0L
}
cpos_matrix <- region_matrix_context(
corpus = corpus,
matrix = .Object,
s_attribute = s_attr,
p_attribute = p_attribute[1],
left = left, right = right,
boundary = boundary,
registry = registry
)
cpos_dt <- as.data.table(cpos_matrix)
colnames(cpos_dt) <- c(
"position", "cpos", "match_id", paste(p_attribute[1], "id", sep = "_")
)
# region_matrix_context can only handle one p-attribute.
# Add ids for further p-attributes now
if (length(p_attribute) > 1L){
for (i in 2L:length(p_attribute)){
ids <- cl_cpos2id(
corpus = corpus,
registry = registry,
p_attribute = p_attribute[i],
cpos = cpos_dt[["cpos"]]
)
cpos_dt[, (paste(p_attribute[i], "id", sep = "_")) := ids]
}
}
setcolorder(cpos_dt, c("match_id", "cpos"))
retval <- as(corpus(corpus), "context")
retval@count <- nrow(.Object)
retval@cpos <- cpos_dt
retval@left <- if (is.character(left)) 0L else as.integer(left)
retval@right <- if (is.character(right)) 0L else as.integer(right)
retval@boundary <- if (is.null(boundary)) character() else boundary
retval@p_attribute <- p_attribute
retval
})
#' @rdname context-method
setMethod("context", "corpus", function(
.Object, query, cqp = is.cqp,
p_attribute = getOption("polmineR.p_attribute"),
region = NULL, boundary = NULL,
left = getOption("polmineR.left"), right = getOption("polmineR.right"),
stoplist = NULL, positivelist = NULL, regex = FALSE,
count = TRUE,
mc = getOption("polmineR.mc"), verbose = FALSE, progress = TRUE,
...
){
if ("pAttribute" %in% names(list(...))){
lifecycle::deprecate_warn(
when = "0.8.7",
what = "context(pAttribute)",
with = "context(p_attribute)"
)
p_attribute <- list(...)[["pAttribute"]]
}
if ("sAttribute" %in% names(list(...))){
lifecycle::deprecate_warn(
when = "0.8.7",
what = "context(sAttribute)",
with = "context(boundary)"
)
boundary <- list(...)[["sAttribute"]]
}
p <- as(.Object, "partition")
# There is a potential overhead of performing the count here: When context-method
# is called by kwic, the count is not needed. However, if context is called by
# cooccurrences, it is much, much faster (for one p-attribute) having done the
# count here for the entire corpus.
if (length(p_attribute) == 1L){
p@stat <- count(.Object@corpus, p_attribute = p_attribute, decode = FALSE)@stat
}
context(
p, query = query, cqp = is.cqp,
left = left, right = right,
p_attribute = p_attribute,
region = region, boundary = boundary,
stoplist = stoplist, positivelist = positivelist, regex = regex,
count = count, mc = mc, verbose = verbose, progress = progress
)
})
#' @rdname context-method
setMethod("context", "character", function(
.Object, query, cqp = is.cqp,
p_attribute = getOption("polmineR.p_attribute"),
region = NULL, boundary = NULL,
left = getOption("polmineR.left"), right = getOption("polmineR.right"),
stoplist = NULL, positivelist = NULL, regex = FALSE,
count = TRUE,
mc = getOption("polmineR.mc"), verbose = FALSE, progress = TRUE,
...
){
context(
.Object = corpus(.Object),
query = query,
cqp = cqp,
p_attribute = p_attribute,
region = region, boundary = boundary,
left = left, right = right,
stoplist = stoplist, positivelist = positivelist, regex = regex,
count = count,
mc = mc,
verbose = verbose,
progress = progress,
...
)
})
#' @docType methods
#' @rdname context-method
setMethod("context", "partition_bundle", function(.Object, query, p_attribute, stoplist = NULL, positivelist = NULL, regex = FALSE, verbose = TRUE, ...){
if ("pAttribute" %in% names(list(...))) p_attribute <- list(...)[["pAttribute"]]
# Turn tokens on positivelist into ids once for entire corpus to avoid doing
# this for every single object.
if (!is.null(positivelist) && !is.numeric(positivelist)){
if (regex){
positivelist <- unlist(lapply(
positivelist,
function(x) regex2id(x = .Object, p_attribute = p_attribute, regex = x)
))
} else {
positivelist <- str2id(
x = .Object, p_attribute = p_attribute, str = positivelist
)
}
}
# Same for stoplist.
if (!is.null(stoplist) && !is.numeric(stoplist)){
if (regex){
stoplist <- unlist(lapply(
stoplist,
function(x) regex2id(x = .Object, p_attribute = p_attribute, regex = x)
))
} else {
stoplist <- str2id(
x = .Object, p_attribute = p_attribute, str = stoplist
)
}
}
y <- as(as(.Object, "corpus"), "context_bundle")
y@query <- query
y@p_attribute <- p_attribute
y@objects <- sapply(
.Object@objects,
function(x) {
.message("get context for partition ", x@name, verbose = verbose)
context(
x, query = query, p_attribute = p_attribute,
positivelist = positivelist, stoplist = stoplist,
verbose = FALSE,
...
)
},
simplify = TRUE,
USE.NAMES = TRUE
)
# Remove NULL objects that result of no match has been obtained
for (i in rev(which(sapply(y@objects, is.null)))) y@objects[[i]] <- NULL
y
})
#' @param complete enhance completely
#' @rdname context-method
setMethod("context", "cooccurrences", function(.Object, query, check = TRUE, complete = FALSE){
newObject <- new(
"context",
query = query,
partition = .Object@partition,
size_partition = as.integer(.Object@partition_size),
left = as.integer(.Object@left),
right = as.integer(.Object@right),
p_attribute = .Object@p_attribute,
corpus = .Object@corpus,
registry_dir = .Object@registry_dir,
data_dir = .Object@data_dir,
template = .Object@template,
info_file = .Object@info_file,
encoding = .Object@encoding,
method = .Object@method,
stat = subset(.Object@stat, .Object@stat[, "node"]==query),
call = deparse(match.call()),
size = unique(subset(.Object@stat, .Object@stat[, "node"] == query)[,"size_window"])
)
stop("due to refactoring the context method, this does not work at present")
# if (complete){
# s_attribute <- names(get(newObject@partition, ".GlobalEnv")@s_attributes)[[1]]
# sAttr <- paste(
# newObject@corpus, ".",
# names(get(newObject@partition, ".GlobalEnv")@s_attributes)[[1]],
# sep = ""
# )
# hits <- cpos(
# newObject@query,
# get(newObject@partition, ".GlobalEnv"),
# p_attribute = newObject@p_attribute,
# verbose = FALSE, check = check
# )
# newObject@size <- nrow(hits)
# hits <- cbind(hits, cl_cpos2struc(corpus = newObject@corpus, s_attribute = s_attribute, cpos = hits[,1], registry = registry()))
# newObject@cpos <- .make_context_dt(hits, left = newObject@left, right = newObject@right, corpus = newObject@corpus, s_attribute = sAttr)
# }
newObject
})
#' @include bundle.R
NULL
#' @docType methods
#' @noRd
setMethod("summary", "context_bundle", function(object, top = 3){
sizes_partition <- unname(sapply(object@objects, slot, "size_partition"))
counts <- unname(sapply(object@objects, slot, "count"))
y <- data.frame(count = counts, freq = counts / sizes_partition)
method <- unlist(unique(unname(sapply(object@objects, slot, "method"))))
if (length(method) > 0L){
y <- cbind(
y,
t(
data.frame(lapply(object@objects, function(x) .statisticalSummary(x)$no))
)
)
colnames(y)[3:6] <- c(">10.83", ">7.88", ">6.63", ">3.84")
}
if (top > 0L){
y <- cbind(
y,
t(
data.frame(
lapply(
object@objects,
function(x) x@stat[[object@p_attribute]][1:top]
)
)
)
)
}
y
})
#' @docType methods
#' @noRd
setMethod("show", "context_bundle", function(object){
summary(object)
})
setAs(from = "kwic", to = "context", def = function(from){
new(
"context",
query = "",
count = -1L,
partition = new("partition", size = 0L, stat = data.table()),
size_partition = -1L,
size_match = -1L,
size = -1L,
boundary = character(),
call = "",
left = from@left,
right = from@right,
p_attribute = from@p_attribute,
corpus = from@corpus,
registry_dir = from@registry_dir,
info_file = from@info_file,
template = from@template,
stat = data.table(),
encoding = from@encoding,
cpos = from@cpos
)
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.